ASER Pakistan 2016

This document explores a dataset from the Annual Status of Education Report (ASER). The raw data was downloaded from https://palnetwork.org/aser-centre/

Preparation

Packages Used

library(tidyverse)
library(ggplot2)
library(gghighlight)
library(stringr)
library(ggmap)
library(dplyr)
library(cartography)
library(sf)
library(tmap)
library(spData)
library(maps)
library(mapdata)
library(maptools)
library(ggthemes)
library(choroplethr)
library(choroplethrAdmin1)
library(choroplethrMaps)
library(rgdal)
# library(choroplethrZip)
# library(spDataLarge)
# install.packages("spDataLarge") not available

Data Import

# Load the ASER 2016 school-level and child-level survey files.
# The former bare read.csv() call was dropped: it parsed the school file a
# second time only to print the entire table to the console.
school <- read.csv("aser/ASER2016GSchool.csv")
child <- read.csv("aser/ASER2016Child.csv")

# Lookup from region code (RID) to display name; used as a facet labeller.
RegionName <- c("2" = "Panjab", 
                "3" = "Sindh", 
                "4" = "Balochistan", 
                "5" = "Khyber Pakhtunkhwa", 
                "6" = "Gilgit-Baltistan", 
                "7" = "Azad Jammu and Kashmir", 
                "8" = "Islamabad - ICT", 
                "9" = "Federally Administrated Tribal Areas")

# Lookup from gender code (C002) to display name; used as a facet labeller.
Gender <- c("0" = "Male",
            "-1" = "Female")

Exploration

Checking Sample Sizes

# Number of distinct child IDs in the child-level table.
length(unique(child[["CID"]]))
## [1] 255196

The total sample size (the number of children) of this dataset is 255196.

# Number of distinct child IDs in district 266 (Hunza).
summarise(
  filter(child, DID == 266),
  N_hunza = length(unique(CID))
)

The sample size of Hunza alone is 1641.

Exploration in Hunza

Gender Proportion

# Mean of the gender code (C002) among children in district 266 (Hunza).
summarise(
  filter(child, DID == 266),
  gender_proportion = mean(C002)
)

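Coding: -1 = female, 0 = male.
Because girls are coded -1 and boys 0, gender_proportion = -0.5173675 means that about 51.7% of the children are girls, i.e. there are slightly more girls than boys in the Hunza sample.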

Age of Children (C001)

# Histogram of age (C001) for district 266 (Hunza), default binning.
ggplot(
  data = filter(child, DID == 266),
  mapping = aes(x = C001)
) +
  geom_histogram()

Ages are well spread out.

Education Status

# Histogram of the education status code (C003) for district 266 (Hunza),
# split into three bins.
ggplot(data = filter(child, DID == 266)) +
  geom_histogram(mapping = aes(x = C003), bins = 3)

1 = never enrolled; 2 = drop-out; 3 = currently enrolled

Education Status by Gender

# Education status (C003) in district 266 (Hunza), one panel per gender (C002).
# `binwidth = 1` yields one bar per status code. The former `bins = 3` was
# removed because ggplot2 ignores `bins` whenever `binwidth` is supplied, so
# passing both was contradictory.
child %>% 
  filter(DID == 266) %>% 
  ggplot(aes(C003)) +
  geom_histogram(binwidth = 1) +
  facet_grid(~C002, labeller = labeller(C002 = Gender))

Both genders look pretty good in terms of the absolute number of currently enrolled children.

The Enrollment Rate by Gender

# Share of currently enrolled children (C003 == 3) per gender in district 266
# (Hunza), drawn as one bar per gender with the rate printed on the bar.
child %>% 
  filter(DID == 266) %>% 
  group_by(C002) %>% 
  summarize(enrollment_rate = mean(C003 == 3)) %>% 
  ungroup() %>% 
  ggplot(aes(C002, enrollment_rate)) +
  geom_col() +
  # Round the printed rate so the label stays readable; the bar height still
  # uses the unrounded value. (A no-op `scale_y_continuous()` was removed.)
  geom_label(aes(label = round(enrollment_rate, 3))) +
  # C002 is numeric (-1 = female, 0 = male), so relabel the two tick marks.
  scale_x_continuous(breaks = c(-1, 0), labels = c("Female", "Male"))

As a rate, both genders are doing pretty well.

Currently-Enrolled: Institution Type (C006)

1 = Government; 2 = Private; 3 = Madrasah(Conventional religious education) School; 4 = Other(Non formal education facility)

# Histogram of the institution type code (C006) for district 266 (Hunza),
# default binning.
ggplot(filter(child, DID == 266), aes(x = C006)) +
  geom_histogram()

Most children go to public schools or private schools

Within Gilgit-Baltistan

# Current enrollment rate (share of rows with C003 == 3) for each district of
# region 6 (Gilgit-Baltistan). The rate is attached to every child row, so
# geom_count() sizes each district's point by its number of rows.
ggplot(
  data = mutate(
    group_by(filter(child, RID == 6), DID),
    Current_Enrollment_Rate = mean(C003 == 3)
  ),
  mapping = aes(x = DID, y = Current_Enrollment_Rate)
) +
  geom_count() +
  scale_x_continuous(
    breaks = 260:266,
    labels = c(
      "Gilgit", "Diamer", "Skardu", "Ghanshe",
      "Astore", "Ghizer", "Hunza-Nagar"
    )
  )

Within Gilgit-Baltistan, Hunza outperforms the other districts.

# Distribution of the reading level (C010) in each district of region 6
# (Gilgit-Baltistan), one box per district code.
ggplot(
  data = group_by(filter(child, RID == 6), DID),
  mapping = aes(x = DID, y = C010)
) +
  geom_boxplot(mapping = aes(group = DID)) +
  scale_x_continuous(
    breaks = 260:266,
    labels = c(
      "Gilgit", "Diamer", "Skardu", "Ghanshe",
      "Astore", "Ghizer", "Hunza-Nagar"
    )
  )

Basic Learning Levels: Reading in Local/National Language (C010)

1 = Beginner/Nothing; 2 = Letters; 3 = Words; 4 = Sentences; 5 = Story

# Histogram of the local/national-language reading level (C010) for
# district 266 (Hunza), default binning.
ggplot(data = filter(child, DID == 266)) +
  geom_histogram(mapping = aes(x = C010))

# Number of district 266 (Hunza) rows whose reading level (C010) is missing.
summarise(
  filter(child, DID == 266),
  na = sum(is.na(C010))
)

Basic Learning Levels by Age

# Local/national-language reading level (C010) in district 266 (Hunza), one
# panel per age (C001). Ages 3 and 4 are excluded because they have no
# learning-level data.
# The exclusion uses `%in%` on the age column: the previous
# `C013 != c(3,4)` tested the English-reading column instead of age and
# recycled the length-2 vector row by row, dropping an alternating,
# unintended set of rows.
child %>% 
  filter(DID == 266, !C001 %in% c(3, 4)) %>% 
  ggplot(aes(C010)) +
  geom_histogram() +
  facet_grid(~C001)

# Children aged 3 and 4 are removed because they have no data.

Basic Learning Levels by Gender

# Reading level (C010) in district 266 (Hunza), one panel per gender (C002).
ggplot(filter(child, DID == 266), aes(x = C010)) +
  geom_histogram() +
  facet_grid(. ~ C002, labeller = labeller(C002 = Gender))

English Reading Levels (C013)

# English reading level (C013) in district 266 (Hunza), one panel per age
# (C001). Ages 3 and 4 are excluded because they have no learning-level data.
# The exclusion uses `%in%` on the age column: the previous
# `C013 != c(3,4)` compared the plotted variable itself against a recycled
# length-2 vector, which removed an alternating set of level-3 and level-4
# rows instead of the 3- and 4-year-olds.
child %>% 
  filter(DID == 266, !C001 %in% c(3, 4)) %>% 
  ggplot(aes(C013)) +
  geom_histogram() +
  facet_grid(~C001)

# Children aged 3 and 4 are removed because they have no data.

English Reading Levels by Gender

# English reading level (C013) in district 266 (Hunza), one panel per
# gender (C002).
ggplot(data = filter(child, DID == 266)) +
  geom_histogram(mapping = aes(x = C013)) +
  facet_grid(. ~ C002, labeller = labeller(C002 = Gender))

Comparison with Other Regions

Current Enrollment Rate

# District-level share of currently enrolled children (C003 == 3), rounded to
# two decimals and attached to every child row, then shown as a histogram
# with one panel per region (RID).
ggplot(
  data = ungroup(
    mutate(
      group_by(child, DID),
      avg = round(mean(C003 == 3), digits = 2)
    )
  ),
  mapping = aes(x = avg)
) +
  geom_histogram() +
  facet_grid(. ~ RID, labeller = labeller(RID = RegionName)) +
  labs(title = "Current Enrollment Rate by Region")

Female Average Learning Levels (Age or other variables are NOT adjusted)

# Average reading level (C010) of girls (C002 == -1) per district, coloured by
# region code (RID) and labelled with the district code (DID).
# The per-district mean is computed on the child rows and the data is then
# collapsed to one row per district before plotting: previously every child
# row was drawn, stacking many identical points and labels on top of each
# other without changing the picture.
child %>% 
  filter(C002 == -1) %>% 
  group_by(DID) %>% 
  mutate(avg_learning = mean(C010, na.rm = TRUE)) %>% 
  ungroup() %>% 
  distinct(RID, DID, avg_learning) %>% 
  ggplot(aes(DID, avg_learning, color = RID)) +
  geom_point() +
  geom_text(aes(label = DID), nudge_x = 5, check_overlap = TRUE)

# Same girls-only district averages of reading level (C010), with region 6
# (Gilgit-Baltistan) highlighted.
# The data is collapsed to one row per district before plotting: previously
# every child row was drawn, stacking many identical points and labels on
# top of each other without changing the picture.
child %>% 
  filter(C002 == -1) %>% 
  group_by(DID) %>% 
  mutate(avg_learning = mean(C010, na.rm = TRUE)) %>% 
  ungroup() %>% 
  distinct(RID, DID, avg_learning) %>% 
  ggplot(aes(DID, avg_learning, color = RID)) +
  geom_point() +
  geom_text(aes(label = DID), nudge_x = 5, check_overlap = TRUE) +
  gghighlight(RID == 6)

It is interesting to note that Gilgit-Baltistan (RID == 6) shows a wide spread in girls' average learning levels, and that Hunza (DID == 266) is in the top group across all regions.

Spatial Analysis

Pakistan Map

# Load the `admin1.map` dataset and keep the rows whose `admin` is "pakistan".
data("admin1.map")
pak <- subset(admin1.map, admin == "pakistan")
region <- pak$region

# Outline polygons from `pak` with named points drawn on top.
# NOTE(review): `map` is not defined in the visible part of this document;
# presumably a data frame with `long`, `lat` and `name` columns - confirm.
ggplot() +
  geom_polygon(data = pak, aes(long, lat, group = group),
               fill = "white", color = "black") +
  # `label` is not a geom_point aesthetic (ggplot2 ignores it with a
  # warning), so it is mapped only in geom_text below.
  geom_point(data = map, aes(long, lat)) +
  geom_text(data = map, aes(long, lat, label = name), check_overlap = TRUE, nudge_y = 1) +
  coord_fixed()

Districts Map

# Merge conflict resolved: the incoming side contained everything on the HEAD
# side plus the Sindh district-name matching below, so it is kept in full.

# Rows of `ica_df` whose Province is "sindh".
ica_df_3 <- ica_df %>% filter(Province == "sindh")

# Rewrite district spellings so they match the `dname` values used as the
# join key below.
ica_df_3$Districts <- ica_df_3$Districts %>%  
  str_replace("ghotki", "gotki") %>%
  str_replace("mirpur khas", "mirpurkhas") %>% 
  str_replace("malir karachi", "karachi-malir-rural") %>% 
  str_replace("naushahro feroze", "nowshero feroze") %>% 
  str_replace("kambar shahdad kot", "qambar shahdadkot") %>% 
  str_replace("sujawal", "sajawal") %>% 
  str_replace("shaheed benazir abad", "shaheed benazirabad") %>% 
  str_replace("tando allahyar", "tando allah yar") %>% 
  as.vector()

# Sindh rows of `child_dname`, with the matching `ica_df_3` columns attached
# by district name.
child_dname_3 <- child_dname %>% filter(RNAME == "Sindh") %>% left_join(ica_df_3, by = c("dname" = "Districts"))

# Per-district sum of column `x`.
# NOTE(review): presumably a check that the join matched every district
# (an NA sum would flag an unmatched name) - confirm.
child_dname_3 %>% group_by(dname) %>% summarize(n = sum(x))
## `summarise()` ungrouping output (override with `.groups` argument)
ica_df_3

# `ica` geometries with `child_ica` points and labels showing the education
# status code (C003). The point and text layers read `x`/`y` from `child_ica`,
# not from the centroid columns computed on `ica` here.
ica %>% 
  mutate(centroid = st_centroid(geometry),
    x = st_coordinates(centroid)[,1],
    y = st_coordinates(centroid)[,2]) %>% 
    ggplot() +
  geom_sf() +
  geom_point(data = child_ica, aes(x, y, label = C003, color = C003)) +
  geom_text(data = child_ica, aes(x, y, label = C003), check_overlap = TRUE, nudge_y = 1)
## Warning: Problem with `mutate()` input `centroid`.
## x st_centroid does not give correct centroids for longitude/latitude data
## i Input `centroid` is `st_centroid(geometry)`.
## Warning in st_centroid.sfc(geometry): st_centroid does not give correct
## centroids for longitude/latitude data
## Warning: Ignoring unknown aesthetics: label
## Warning: Removed 8541 rows containing missing values (geom_point).
## Warning: Removed 8541 rows containing missing values (geom_text).

# Same map with points coloured by the reading level code (C010).
ica %>% 
  mutate(centroid = st_centroid(geometry),
    x = st_coordinates(centroid)[,1],
    y = st_coordinates(centroid)[,2]) %>% 
    ggplot() +
  geom_sf() +
  geom_point(data = child_ica, aes(x, y, label = C010, color = C010))
## Warning: Problem with `mutate()` input `centroid`.
## x st_centroid does not give correct centroids for longitude/latitude data
## i Input `centroid` is `st_centroid(geometry)`.
## Warning in st_centroid.sfc(geometry): st_centroid does not give correct
## centroids for longitude/latitude data
## Warning: Ignoring unknown aesthetics: label
## Warning: Removed 8541 rows containing missing values (geom_point).

  # geom_text(data = child_ica, aes(x, y, label = C010), check_overlap = TRUE, nudge_y = 1)

Trial: children, gender

# Per-district mean of the gender code (C002), attached to every row of
# `child_ica` and printed.
mutate(group_by(child_ica, DID), gender_ratio = mean(C002))
# `ica` geometries with `child_ica` points coloured by that per-district mean.
ggplot(
  data = mutate(
    ica,
    centroid = st_centroid(geometry),
    x = st_coordinates(centroid)[, 1],
    y = st_coordinates(centroid)[, 2]
  )
) +
  geom_sf() +
  geom_point(
    data = mutate(group_by(child_ica, DID), gender_ratio = mean(C002)),
    mapping = aes(x = x, y = y, label = gender_ratio, color = gender_ratio)
  )
## Warning: Problem with `mutate()` input `centroid`.
## x st_centroid does not give correct centroids for longitude/latitude data
## i Input `centroid` is `st_centroid(geometry)`.
## Warning in st_centroid.sfc(geometry): st_centroid does not give correct
## centroids for longitude/latitude data
## Warning: Ignoring unknown aesthetics: label
## Warning: Removed 8541 rows containing missing values (geom_point).

Gender coding: 0 = male, -1 = female, so a gender_ratio closer to -1 means a larger share of girls in that district.
